Info

In this file, I will import all raw data and prepare for downstream data tidying, transformation, visualization, modeling and communication.

Load libraries

First, we load necessary libraries.

library(tidyverse) # A whole bunch of packages necessary for data analysis
library(readxl) # To load Excel files appropriately
library(haven) # To load SPSS files appropriately

Next, we move on to load data.

Clinical data

Data file

# SPSS file
data_clinic <- haven::read_sav(file = "../data/raw/noma_clinic.sav")

# Excel file
data_clinic <- readxl::read_xlsx(path = "../data/raw/noma_clinic.xlsx")

# Text file (tab-delimited)
data_clinic <- readr::read_delim(file = "../data/raw/noma_clinic.txt", delim = "\t", na = c("", "NA", "#NULL!"), locale = locale(decimal_mark = ",", encoding = "latin1"))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   initials = col_character(),
##   idmills = col_character(),
##   base_date = col_character(),
##   end_date = col_character()
## )
## See spec(...) for full column specifications.
# Text file (tab-delimited) -- shortcut
data_clinic <- readr::read_tsv(file = "../data/raw/noma_clinic.txt", na = c("", "NA", "#NULL!"), locale = locale(decimal_mark = ",", encoding = "latin1"))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   initials = col_character(),
##   idmills = col_character(),
##   base_date = col_character(),
##   end_date = col_character()
## )
## See spec(...) for full column specifications.
# CSV file (comma-separated) -- shortcut
data_clinic <- readr::read_csv(file = "../data/raw/noma_clinic.csv", na = c("", "NA", "#NULL!"), locale = locale(decimal_mark = ".", encoding = "latin1"))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   initials = col_character(),
##   idmills = col_character(),
##   base_date = col_character(),
##   end_date = col_character()
## )
## See spec(...) for full column specifications.
# Om this case, the direct SPSS load was superior
data_clinic <- haven::read_sav(file = "../data/raw/noma_clinic.sav")

# Look at the data
data_clinic

Many ways to Rome, indeed.

This looks nice (especially the direct SPSS load), but some adjustments are still necessary.

  1. R wants to work with factors for categorical variables. So for this particular SPSS file, we will change all haven_labelled variables to factors.

  2. Specify what each factor level should be, and set the level order.

data_clinic <- data_clinic %>% 
  modify_if(haven::is.labelled, haven::as_factor, levels = "both") %>% 
  mutate(
    
    # The group variable
    group = case_when(
      group == 0 ~ "control", 
      group == 1 ~ "intervention", 
      TRUE ~ NA_character_) %>% factor(levels = c("control", "intervention")), 
    
    # The gender variable
    gender = case_when(
      gender == "[2] kvinne" ~ "woman", 
      gender == "[1] mann" ~ "man", 
      TRUE ~ NA_character_) %>% factor(levels = c("woman", "man")), 
    
    # Hypertensive medication use
    hypertensive_med = case_when(
      hypertensive_med == 0 ~ "no", 
      hypertensive_med == 1 ~ "yes", 
      TRUE ~ NA_character_) %>% factor(levels = c("no", "yes")), 
    
    # Family history of myocardial infaction (before the age of 60 years)
    myocardialinfarction = case_when(
      myocardialinfarction == "[0] nei" ~ "no", 
      myocardialinfarction == "[1] Ja" ~ "yes", 
      myocardialinfarction == "[2] Vet ikke" ~ "unknown", 
      myocardialinfarction == "3" ~ "unknown", 
      TRUE ~ NA_character_) %>% factor(levels = c("no", "yes", "unknown")), 
    
    # Smoking status (multi-level)
    smoking = case_when(
      smoking == "[1] ja, regelmessig" ~ "regularly", 
      smoking == "[2] ja, men sjelden" ~ "seldom", 
      smoking == "[3] jeg har sluttet å røyke" ~ "quit", 
      smoking == "[4] aldri røkt" ~ "never", 
      smoking == "[5] bruker snus" ~ "snuff", 
      TRUE ~ NA_character_) %>% factor(levels = c("regularly", "seldom", "quit", "never", "snuff")), 
    
    # Smoking status (dichotomous: 1 + 2 + 5, or 3 + 4)
    smoking2 = case_when(
      smoking2 == 0 ~ "no", 
      smoking2 == 1 ~ "yes", 
      TRUE ~ NA_character_) %>% factor(levels = c("no", "yes"))
    )

data_clinic

This looks much better.

Finally, remove the variables that we don’t need, for example percentages and duplicate IDs.

data_clinic <- data_clinic %>% select(-matches("idmills|initials2|deltapercent_|deltaglucose_glucose"))

Annotation file

annotation_clinic <- read_xlsx(path = "../data/raw/annotation_clinic.xlsx")
annotation_clinic

Tidy & prep

Right now, the data files are loaded, but they are not tidy – We need to get the data files cleaned up before we can move on to explore further.

But what does tidy mean?

There are three interrelated rules which make a dataset tidy:

  1. Each variable must have its own column.
  2. Each observation must have its own row.
  3. Each value must have its own cell.

Lets start with the clinical data.

First variables that matches the word ‘base_’, ‘end_’ or ‘delta_’:

data_clinic_vars <- data_clinic %>% 
  select(id, matches("base_|end_|delta_|deltapercent_"), -matches("date")) %>%
  pivot_longer(-id, names_to = c("time", "variable"), names_sep = "_", values_to = "value") %>% 
  pivot_wider(names_from = "variable", values_from = "value")

data_clinic_vars

Then we do the ‘date’ variables:

data_clinic_date <- data_clinic %>% 
  select(id, matches("date")) %>% 
  pivot_longer(-id, names_to = c("time", "variable"), names_sep = "_", values_to = "value") %>% 
  pivot_wider(names_from = "variable", values_from = "value")

data_clinic_date

Join these two tibbles.

data_clinic <- data_clinic_vars %>% 
  left_join(data_clinic_date, by = c("id", "time")) %>%
  left_join(select(data_clinic, -matches("base_|end_|delta_")), by = "id", "time") %>% 
  select(id, initials, time, date, group:height, everything())

data_clinic

Nightingale data

Data file

Load Nightingale data directly from the raw data file.

data_nightingale <- readxl::read_xlsx(
  path = "../data/raw/noma-nightingale.xlsx", 
  sheet = 1, 
  skip = 11, 
  na = c("Zero (0)", "NA", "TAG", "NDEF"), 
  col_names = TRUE)
## New names:
## * `1` -> `1...2`
## * `1` -> `1...3`
## * `1` -> `1...4`
## * `1` -> `1...5`
## * `1` -> `1...6`
## * ... and 220 more problems
data_nightingale

We didnt get the column names, as the column names were stored in row 10 (and not row 12, which then automatically would have been set as column names). To get the column names, load the sheet again, but skip the first 9 rows, and then pull our the names/colnames. Finally, use this character string to set the names of the above-loaded Nightingale data frame.

colnames_nightingale <- readxl::read_xlsx(
  path = "../data/raw/noma-nightingale.xlsx", 
  sheet = 1, 
  skip = 9, 
  n_max = 1, 
  col_names = TRUE) %>% 
  names()

# This is what the column names look like
colnames_nightingale
##   [1] "sampleid"      "XXL-VLDL-P"    "XXL-VLDL-L"    "XXL-VLDL-PL"  
##   [5] "XXL-VLDL-C"    "XXL-VLDL-CE"   "XXL-VLDL-FC"   "XXL-VLDL-TG"  
##   [9] "XL-VLDL-P"     "XL-VLDL-L"     "XL-VLDL-PL"    "XL-VLDL-C"    
##  [13] "XL-VLDL-CE"    "XL-VLDL-FC"    "XL-VLDL-TG"    "L-VLDL-P"     
##  [17] "L-VLDL-L"      "L-VLDL-PL"     "L-VLDL-C"      "L-VLDL-CE"    
##  [21] "L-VLDL-FC"     "L-VLDL-TG"     "M-VLDL-P"      "M-VLDL-L"     
##  [25] "M-VLDL-PL"     "M-VLDL-C"      "M-VLDL-CE"     "M-VLDL-FC"    
##  [29] "M-VLDL-TG"     "S-VLDL-P"      "S-VLDL-L"      "S-VLDL-PL"    
##  [33] "S-VLDL-C"      "S-VLDL-CE"     "S-VLDL-FC"     "S-VLDL-TG"    
##  [37] "XS-VLDL-P"     "XS-VLDL-L"     "XS-VLDL-PL"    "XS-VLDL-C"    
##  [41] "XS-VLDL-CE"    "XS-VLDL-FC"    "XS-VLDL-TG"    "IDL-P"        
##  [45] "IDL-L"         "IDL-PL"        "IDL-C"         "IDL-CE"       
##  [49] "IDL-FC"        "IDL-TG"        "L-LDL-P"       "L-LDL-L"      
##  [53] "L-LDL-PL"      "L-LDL-C"       "L-LDL-CE"      "L-LDL-FC"     
##  [57] "L-LDL-TG"      "M-LDL-P"       "M-LDL-L"       "M-LDL-PL"     
##  [61] "M-LDL-C"       "M-LDL-CE"      "M-LDL-FC"      "M-LDL-TG"     
##  [65] "S-LDL-P"       "S-LDL-L"       "S-LDL-PL"      "S-LDL-C"      
##  [69] "S-LDL-CE"      "S-LDL-FC"      "S-LDL-TG"      "XL-HDL-P"     
##  [73] "XL-HDL-L"      "XL-HDL-PL"     "XL-HDL-C"      "XL-HDL-CE"    
##  [77] "XL-HDL-FC"     "XL-HDL-TG"     "L-HDL-P"       "L-HDL-L"      
##  [81] "L-HDL-PL"      "L-HDL-C"       "L-HDL-CE"      "L-HDL-FC"     
##  [85] "L-HDL-TG"      "M-HDL-P"       "M-HDL-L"       "M-HDL-PL"     
##  [89] "M-HDL-C"       "M-HDL-CE"      "M-HDL-FC"      "M-HDL-TG"     
##  [93] "S-HDL-P"       "S-HDL-L"       "S-HDL-PL"      "S-HDL-C"      
##  [97] "S-HDL-CE"      "S-HDL-FC"      "S-HDL-TG"      "XXL-VLDL-PL_%"
## [101] "XXL-VLDL-C_%"  "XXL-VLDL-CE_%" "XXL-VLDL-FC_%" "XXL-VLDL-TG_%"
## [105] "XL-VLDL-PL_%"  "XL-VLDL-C_%"   "XL-VLDL-CE_%"  "XL-VLDL-FC_%" 
## [109] "XL-VLDL-TG_%"  "L-VLDL-PL_%"   "L-VLDL-C_%"    "L-VLDL-CE_%"  
## [113] "L-VLDL-FC_%"   "L-VLDL-TG_%"   "M-VLDL-PL_%"   "M-VLDL-C_%"   
## [117] "M-VLDL-CE_%"   "M-VLDL-FC_%"   "M-VLDL-TG_%"   "S-VLDL-PL_%"  
## [121] "S-VLDL-C_%"    "S-VLDL-CE_%"   "S-VLDL-FC_%"   "S-VLDL-TG_%"  
## [125] "XS-VLDL-PL_%"  "XS-VLDL-C_%"   "XS-VLDL-CE_%"  "XS-VLDL-FC_%" 
## [129] "XS-VLDL-TG_%"  "IDL-PL_%"      "IDL-C_%"       "IDL-CE_%"     
## [133] "IDL-FC_%"      "IDL-TG_%"      "L-LDL-PL_%"    "L-LDL-C_%"    
## [137] "L-LDL-CE_%"    "L-LDL-FC_%"    "L-LDL-TG_%"    "M-LDL-PL_%"   
## [141] "M-LDL-C_%"     "M-LDL-CE_%"    "M-LDL-FC_%"    "M-LDL-TG_%"   
## [145] "S-LDL-PL_%"    "S-LDL-C_%"     "S-LDL-CE_%"    "S-LDL-FC_%"   
## [149] "S-LDL-TG_%"    "XL-HDL-PL_%"   "XL-HDL-C_%"    "XL-HDL-CE_%"  
## [153] "XL-HDL-FC_%"   "XL-HDL-TG_%"   "L-HDL-PL_%"    "L-HDL-C_%"    
## [157] "L-HDL-CE_%"    "L-HDL-FC_%"    "L-HDL-TG_%"    "M-HDL-PL_%"   
## [161] "M-HDL-C_%"     "M-HDL-CE_%"    "M-HDL-FC_%"    "M-HDL-TG_%"   
## [165] "S-HDL-PL_%"    "S-HDL-C_%"     "S-HDL-CE_%"    "S-HDL-FC_%"   
## [169] "S-HDL-TG_%"    "VLDL-D"        "LDL-D"         "HDL-D"        
## [173] "Serum-C"       "VLDL-C"        "Remnant-C"     "LDL-C"        
## [177] "HDL-C"         "HDL2-C"        "HDL3-C"        "EstC"         
## [181] "FreeC"         "Serum-TG"      "VLDL-TG"       "LDL-TG"       
## [185] "HDL-TG"        "TotPG"         "TG/PG"         "PC"           
## [189] "SM"            "TotCho"        "ApoA1"         "ApoB"         
## [193] "ApoB/ApoA1"    "TotFA"         "UnSat"         "DHA"          
## [197] "LA"            "FAw3"          "FAw6"          "PUFA"         
## [201] "MUFA"          "SFA"           "DHA/FA"        "LA/FA"        
## [205] "FAw3/FA"       "FAw6/FA"       "PUFA/FA"       "MUFA/FA"      
## [209] "SFA/FA"        "Glc"           "Lac"           "Cit"          
## [213] "Ala"           "Gln"           "His"           "Ile"          
## [217] "Leu"           "Val"           "Phe"           "Tyr"          
## [221] "Ace"           "AcAce"         "bOHBut"        "Crea"         
## [225] "Alb"           "Gp"
# Add to data frame
data_nightingale <- data_nightingale %>% set_names(colnames_nightingale)

data_nightingale

This looks much better.

Note also that there are a lot of zero’s in the data frame. These are probably the largest VLDL particles and the specific lipid species. Nightingale has very sensitive instruments, but they obviously cannot measure a XXL-VLDL subclass measure to exactly zero. These should therefore be NA/missing.

data_nightingale %>% 
  summarize_all(~sum(. == 0)) %>% 
  pivot_longer(cols = names(.), names_to = "variable", values_to = "value") %>% 
  arrange(desc(value))

Indeed. We will change these to missing/NA.

data_nightingale <- data_nightingale %>% 
  mutate_at(vars(colnames_nightingale[-1]), ~na_if(x = ., y = 0))

Annotation object

To create the Nightingale annotation file, we pull out the few rows containing important annotation information, and use that as a starting point.

annotation_nightingale <- readxl::read_xlsx(path = "../data/raw/noma-nightingale.xlsx", skip = 8, n_max = 3) %>% 
  mutate(name.full = case_when(
    `...1` == "sampleid" ~ "name.short", 
    is.na(`...1`) ~ "unit", 
    `...1` == "success %" ~ "success_%")) %>% 
  select(name.full, everything(), -`...1`) %>% 
  as.data.frame() %>% column_to_rownames(var = "name.full") %>% t() %>% 
  as.data.frame() %>% rownames_to_column(var = "name.full") %>% as_tibble() %>% 
  mutate_all(as.character) %>% 
  mutate(`success_%` = as.numeric(`success_%`))
## New names:
## * `` -> ...1
annotation_nightingale

Good – this is a nice starting point.

Next we subset and specify information from the existing variables, and create new annotation variables while at it.

annotation_nightingale <- annotation_nightingale %>%
  mutate(
    
    # Group variables into'types'; important for faceting
    type = case_when(
      str_detect(name.short, pattern = "-P$") ~ "Particle concentration", 
      str_detect(name.short, pattern = "-L$") ~ "Total lipids", 
      str_detect(name.short, pattern = "-PL$|-PL_%$|TotPG|TG/PG") ~ "Phospholipids", 
      str_detect(name.short, pattern = "-C$|-C_%$|EstC|FreeC") ~ "Cholesterol", 
      str_detect(name.short, pattern = "-CE$|-CE_%$") ~ "Cholesterol esters", 
      str_detect(name.short, pattern = "-FC$|-FC_%$") ~ "Free cholesterol", 
      str_detect(name.short, pattern = "-TG$|-TG_%$") ~ "Triglycerides", 
      str_detect(name.short, pattern = "-D$") ~ "Size", 
      str_detect(name.short, pattern = "PC|SM|TotCho") ~ "Phospholipids", 
      str_detect(name.short, pattern = "Apo") ~ "Apolipoproteins", 
      str_detect(name.full, pattern = " acid|unsaturation") ~ "Fatty acids", 
      str_detect(name.short, pattern = "Glc|Lac|Cit|^Ace$") ~ "Glucose metabolism", 
      str_detect(name.short, pattern = "Ile|Leu|Val") ~ "AA, branched-chain", 
      str_detect(name.short, pattern = "Phe|Tyr|His") ~ "AA, aromatic", 
      str_detect(name.short, pattern = "Ala|Gln") ~ "AA, other", 
      str_detect(name.short, pattern = "^AcAce$|bOHBut") ~ "Ketone bodies", 
      str_detect(name.short, pattern = "Crea|Alb|Gp") ~ "Miscellaneous", 
      TRUE ~ NA_character_
      ) %>% 
      
      # Coerce to factor and set the levels (meaning: specify the order in which panels/facets should be displayed)
      factor(levels = c(
        "Particle concentration", 
        "Apolipoproteins", 
        "Size", 
        "Total lipids", 
        "Cholesterol", 
        "Cholesterol esters", 
        "Free cholesterol", 
        "Triglycerides", 
        "Phospholipids", 
        "Fatty acids", 
        "Glucose metabolism", 
        "AA, branched-chain", 
        "AA, aromatic", 
        "AA, other", 
        "Ketone bodies", 
        "Miscellaneous")), 
    
    type.short = case_when(
      type == "Particle concentration" ~ "Particles", 
      type == "Apolipoproteins" ~ "Apos", 
      type == "Size" ~ "Size", 
      type == "Total lipids" ~ "Lipids", 
      type == "Cholesterol" ~ "Chol", 
      type == "Cholesterol esters" ~ "CEs", 
      type == "Free cholesterol" ~ "FCs", 
      type == "Triglycerides" ~ "TGs", 
      type == "Phospholipids" ~ "PLs", 
      type == "Fatty acids" ~ "FAs", 
      type == "Glucose metabolism" ~ "Glucose", 
      type == "AA, branched-chain" ~ "AA, br-ch", 
      type == "AA, aromatic" ~ "AA, arom", 
      type == "AA, other" ~ "AA, other", 
      type == "Ketone bodies" ~ "Ketones", 
      type == "Miscellaneous" ~ "Misc", 
      TRUE ~ NA_character_
      ) %>% 
      factor(levels = c(
        "Particles", "Apos", "Size", "Lipids", "Chol", "CEs", "FCs", "TGs", "PLs", 
        "FAs", "Glucose", "AA, br-ch", "AA, arom", "AA, other", "Ketones", "Misc"
      )), 
    
    # Pull out the subclass
    class = case_when(
      str_detect(name.short, pattern = "VLDL") ~ "VLDL", 
      str_detect(name.short, pattern = "LDL") ~ "LDL", 
      str_detect(name.short, pattern = "IDL") ~ "IDL", 
      str_detect(name.short, pattern = "HDL") ~ "HDL", 
      TRUE ~ NA_character_
    ), 
    
    # Pull out the subclass size
    size = case_when(
      str_detect(name.short, pattern = "^XXL-") ~ "XXL", 
      str_detect(name.short, pattern = "^XL-") ~ "XL", 
      str_detect(name.short, pattern = "^L-") ~ "L", 
      str_detect(name.short, pattern = "^M-") ~ "M", 
      str_detect(name.short, pattern = "IDL") ~ "M", 
      str_detect(name.short, pattern = "^S-") ~ "S", 
      str_detect(name.short, pattern = "^XS-") ~ "XS", 
      str_detect(name.short, pattern = "^XXS-") ~ "XXS", 
      TRUE ~ NA_character_
    ), 
    
    # Create a 'lipid class' variable
    lipid.subclasses = case_when(
      !is.na(size) | class == "IDL" ~ "yes", 
      TRUE ~ "no"
    ), 
    
    # Create a 'pretty' variable
    name.pretty = case_when(
      class == "IDL" ~ "IDL", 
      !is.na(class) & !is.na(size) ~ paste0(size, "-", class), 
      type == "Size" ~ class, 
      type == "Cholesterol" & is.na(size) ~ name.short, 
      type == "Triglycerides" & is.na(size) ~ name.short, 
      type == "Phospholipids" & is.na(size) ~ name.short, 
      type == "Apolipoproteins" & is.na(size) ~ name.short, 
      type == "Fatty acids" & is.na(size) ~ name.short, 
      type == "Glucose metabolism" & is.na(size) ~ name.full, 
      type == "AA, other" & is.na(size) ~ name.full, 
      type == "AA, aromatic" & is.na(size) ~ name.full, 
      type == "AA, branched-chain" & is.na(size) ~ name.full, 
      type == "Ketone bodies" & is.na(size) ~ name.full, 
      name.short == "Gp" ~ "Gp-acetyls", 
      type == "Miscellaneous" & is.na(size) ~ name.full, 
      TRUE ~ NA_character_
      ), 
    
    name.pretty.unit = case_when(
      class == "IDL" ~ paste0("IDL", " (", unit, ")"), 
      !is.na(class) & !is.na(size) ~ paste0(size, "-", class, " (", unit, ")"), 
      type == "Size" ~ paste0(class, " (", unit, ")"), 
      type == "Cholesterol" & is.na(size) ~ paste0(name.short, " (", unit, ")"), 
      type == "Triglycerides" & is.na(size) ~ paste0(name.short, " (", unit, ")"), 
      type == "Phospholipids" & is.na(size) ~ paste0(name.short, " (", unit, ")"), 
      type == "Apolipoproteins" & is.na(size) ~ paste0(name.short, " (", unit, ")"), 
      type == "Fatty acids" & is.na(size) ~ paste0(name.short, " (", unit, ")"), 
      type == "Glucose metabolism" & is.na(size) ~ paste0(name.full, " (", unit, ")"), 
      type == "AA, other" & is.na(size) ~ paste0(name.full, " (", unit, ")"), 
      type == "AA, aromatic" & is.na(size) ~ paste0(name.full, " (", unit, ")"), 
      type == "AA, branched-chain" & is.na(size) ~ paste0(name.full, " (", unit, ")"), 
      type == "Ketone bodies" & is.na(size) ~ paste0(name.full, " (", unit, ")"), 
      name.short == "Gp" ~ paste0("Gp-acetyls", " (", unit, ")"), 
      type == "Miscellaneous" & is.na(size) ~ paste0(name.full, " (", unit, ")"), 
      TRUE ~ NA_character_
      ), 
    
    # Change unit NAs to 'ratio'
    unit = case_when(is.na(unit) ~ "ratio", TRUE ~ unit), 
    
    # This is the general variable order; any changes can be made using fct_relevel
    name.order = row_number()
  )

annotation_nightingale

Great stuff!

Tidy & prep

data_nightingale <- data_nightingale %>% 
  separate(sampleid, into = c("oslo", "noma.id", "id_time"), sep = "_") %>% 
  separate(id_time, into = c("id", "time"), sep = "-") %>% 
  mutate(
    time = case_when(
      time == "V0" ~ "base", 
      time == "V4" ~ "end", 
      TRUE ~ NA_character_) %>% 
        factor(levels = c("base", "end")), 
    id = as.numeric(id)) %>% 
  select(-oslo, -noma.id)

data_nightingale

This looks very nice. The tibble is ready to join with the clinical data. But first, let’s calculate the delta/change value for each variable, for each participant.

Calculate delta

data_nightingale <- data_nightingale %>% 
  pivot_longer(-c(id, time), names_to = "variables", values_to = "value") %>% 
  pivot_wider(names_from = "time", values_from = "value") %>% 
  mutate(delta = end - base) %>% 
  pivot_longer(-c(id, variables), names_to = "time", values_to = "value") %>% 
  pivot_wider(names_from = "variables", values_from = "value")

data_nightingale

Well, that was easy!

Join

Finally, we are ready to do the full join of clinical data and Nightingale data.

data_comb <- left_join(data_clinic, data_nightingale, by = c("id", "time"))

Misc

Baseline clinical data

Prepare a smaller data frame with only baseline values, to be used in the 01-basics script.

data_noma <- data_clinic %>% 
  filter(time == "base") %>% 
  mutate(overweight = if_else(bmi > 25, true = "Overweight", "Not overweight")) %>% 
  select(-time)

Variable vector list

Prepare some variable vectors that might come in handy later on. Put all vectors in a list for each of access.

drop <- c("id", "initials", "time", "date", "group")
variables <- list(
  "clinic" = data_clinic %>% select(-one_of(drop)) %>% names(), 
  "clinic_num" = data_clinic %>% select(-one_of(drop)) %>% select_if(is.numeric) %>% names(), 
  "clinic_fac" = data_clinic %>% select(-one_of(drop)) %>% select_if(is.factor) %>% names(), 
  "nightingale" = data_nightingale %>% select(-one_of(drop)) %>% names()
  )

variables
## $clinic
##  [1] "age"                  "gender"               "hypertensive_med"    
##  [4] "myocardialinfarction" "smoking"              "smoking2"            
##  [7] "height"               "weight"               "bmi"                 
## [10] "waist"                "hip"                  "whratio"             
## [13] "sbp"                  "dbp"                  "pulse"               
## [16] "totalfatpercent"      "totalfatweight"       "fatfreeweight"       
## [19] "leuko"                "neutro"               "lymph"               
## [22] "mono"                 "eosino"               "creatinine"          
## [25] "asat"                 "alat"                 "ggt"                 
## [28] "alp"                  "hba1c"                "glucose"             
## [31] "insulin"              "tg"                   "tc"                  
## [34] "hdlc"                 "ldlc"                 "ldlhdlratio"         
## [37] "apoa1"                "apob"                 "aporatio"            
## [40] "lpa"                  "crp"                  "tsh"                 
## [43] "vitd"                 "vitd3"               
## 
## $clinic_num
##  [1] "age"             "height"          "weight"         
##  [4] "bmi"             "waist"           "hip"            
##  [7] "whratio"         "sbp"             "dbp"            
## [10] "pulse"           "totalfatpercent" "totalfatweight" 
## [13] "fatfreeweight"   "leuko"           "neutro"         
## [16] "lymph"           "mono"            "eosino"         
## [19] "creatinine"      "asat"            "alat"           
## [22] "ggt"             "alp"             "hba1c"          
## [25] "glucose"         "insulin"         "tg"             
## [28] "tc"              "hdlc"            "ldlc"           
## [31] "ldlhdlratio"     "apoa1"           "apob"           
## [34] "aporatio"        "lpa"             "crp"            
## [37] "tsh"             "vitd"            "vitd3"          
## 
## $clinic_fac
## [1] "gender"               "hypertensive_med"     "myocardialinfarction"
## [4] "smoking"              "smoking2"            
## 
## $nightingale
##   [1] "XXL-VLDL-P"    "XXL-VLDL-L"    "XXL-VLDL-PL"   "XXL-VLDL-C"   
##   [5] "XXL-VLDL-CE"   "XXL-VLDL-FC"   "XXL-VLDL-TG"   "XL-VLDL-P"    
##   [9] "XL-VLDL-L"     "XL-VLDL-PL"    "XL-VLDL-C"     "XL-VLDL-CE"   
##  [13] "XL-VLDL-FC"    "XL-VLDL-TG"    "L-VLDL-P"      "L-VLDL-L"     
##  [17] "L-VLDL-PL"     "L-VLDL-C"      "L-VLDL-CE"     "L-VLDL-FC"    
##  [21] "L-VLDL-TG"     "M-VLDL-P"      "M-VLDL-L"      "M-VLDL-PL"    
##  [25] "M-VLDL-C"      "M-VLDL-CE"     "M-VLDL-FC"     "M-VLDL-TG"    
##  [29] "S-VLDL-P"      "S-VLDL-L"      "S-VLDL-PL"     "S-VLDL-C"     
##  [33] "S-VLDL-CE"     "S-VLDL-FC"     "S-VLDL-TG"     "XS-VLDL-P"    
##  [37] "XS-VLDL-L"     "XS-VLDL-PL"    "XS-VLDL-C"     "XS-VLDL-CE"   
##  [41] "XS-VLDL-FC"    "XS-VLDL-TG"    "IDL-P"         "IDL-L"        
##  [45] "IDL-PL"        "IDL-C"         "IDL-CE"        "IDL-FC"       
##  [49] "IDL-TG"        "L-LDL-P"       "L-LDL-L"       "L-LDL-PL"     
##  [53] "L-LDL-C"       "L-LDL-CE"      "L-LDL-FC"      "L-LDL-TG"     
##  [57] "M-LDL-P"       "M-LDL-L"       "M-LDL-PL"      "M-LDL-C"      
##  [61] "M-LDL-CE"      "M-LDL-FC"      "M-LDL-TG"      "S-LDL-P"      
##  [65] "S-LDL-L"       "S-LDL-PL"      "S-LDL-C"       "S-LDL-CE"     
##  [69] "S-LDL-FC"      "S-LDL-TG"      "XL-HDL-P"      "XL-HDL-L"     
##  [73] "XL-HDL-PL"     "XL-HDL-C"      "XL-HDL-CE"     "XL-HDL-FC"    
##  [77] "XL-HDL-TG"     "L-HDL-P"       "L-HDL-L"       "L-HDL-PL"     
##  [81] "L-HDL-C"       "L-HDL-CE"      "L-HDL-FC"      "L-HDL-TG"     
##  [85] "M-HDL-P"       "M-HDL-L"       "M-HDL-PL"      "M-HDL-C"      
##  [89] "M-HDL-CE"      "M-HDL-FC"      "M-HDL-TG"      "S-HDL-P"      
##  [93] "S-HDL-L"       "S-HDL-PL"      "S-HDL-C"       "S-HDL-CE"     
##  [97] "S-HDL-FC"      "S-HDL-TG"      "XXL-VLDL-PL_%" "XXL-VLDL-C_%" 
## [101] "XXL-VLDL-CE_%" "XXL-VLDL-FC_%" "XXL-VLDL-TG_%" "XL-VLDL-PL_%" 
## [105] "XL-VLDL-C_%"   "XL-VLDL-CE_%"  "XL-VLDL-FC_%"  "XL-VLDL-TG_%" 
## [109] "L-VLDL-PL_%"   "L-VLDL-C_%"    "L-VLDL-CE_%"   "L-VLDL-FC_%"  
## [113] "L-VLDL-TG_%"   "M-VLDL-PL_%"   "M-VLDL-C_%"    "M-VLDL-CE_%"  
## [117] "M-VLDL-FC_%"   "M-VLDL-TG_%"   "S-VLDL-PL_%"   "S-VLDL-C_%"   
## [121] "S-VLDL-CE_%"   "S-VLDL-FC_%"   "S-VLDL-TG_%"   "XS-VLDL-PL_%" 
## [125] "XS-VLDL-C_%"   "XS-VLDL-CE_%"  "XS-VLDL-FC_%"  "XS-VLDL-TG_%" 
## [129] "IDL-PL_%"      "IDL-C_%"       "IDL-CE_%"      "IDL-FC_%"     
## [133] "IDL-TG_%"      "L-LDL-PL_%"    "L-LDL-C_%"     "L-LDL-CE_%"   
## [137] "L-LDL-FC_%"    "L-LDL-TG_%"    "M-LDL-PL_%"    "M-LDL-C_%"    
## [141] "M-LDL-CE_%"    "M-LDL-FC_%"    "M-LDL-TG_%"    "S-LDL-PL_%"   
## [145] "S-LDL-C_%"     "S-LDL-CE_%"    "S-LDL-FC_%"    "S-LDL-TG_%"   
## [149] "XL-HDL-PL_%"   "XL-HDL-C_%"    "XL-HDL-CE_%"   "XL-HDL-FC_%"  
## [153] "XL-HDL-TG_%"   "L-HDL-PL_%"    "L-HDL-C_%"     "L-HDL-CE_%"   
## [157] "L-HDL-FC_%"    "L-HDL-TG_%"    "M-HDL-PL_%"    "M-HDL-C_%"    
## [161] "M-HDL-CE_%"    "M-HDL-FC_%"    "M-HDL-TG_%"    "S-HDL-PL_%"   
## [165] "S-HDL-C_%"     "S-HDL-CE_%"    "S-HDL-FC_%"    "S-HDL-TG_%"   
## [169] "VLDL-D"        "LDL-D"         "HDL-D"         "Serum-C"      
## [173] "VLDL-C"        "Remnant-C"     "LDL-C"         "HDL-C"        
## [177] "HDL2-C"        "HDL3-C"        "EstC"          "FreeC"        
## [181] "Serum-TG"      "VLDL-TG"       "LDL-TG"        "HDL-TG"       
## [185] "TotPG"         "TG/PG"         "PC"            "SM"           
## [189] "TotCho"        "ApoA1"         "ApoB"          "ApoB/ApoA1"   
## [193] "TotFA"         "UnSat"         "DHA"           "LA"           
## [197] "FAw3"          "FAw6"          "PUFA"          "MUFA"         
## [201] "SFA"           "DHA/FA"        "LA/FA"         "FAw3/FA"      
## [205] "FAw6/FA"       "PUFA/FA"       "MUFA/FA"       "SFA/FA"       
## [209] "Glc"           "Lac"           "Cit"           "Ala"          
## [213] "Gln"           "His"           "Ile"           "Leu"          
## [217] "Val"           "Phe"           "Tyr"           "Ace"          
## [221] "AcAce"         "bOHBut"        "Crea"          "Alb"          
## [225] "Gp"

Annotation list

Before saving the annotation files, put them in a list for ease of access.

annotation <- list(
  "clinic" = annotation_clinic, 
  "nightingale" = annotation_nightingale
)

Conclusions and take-homes

  • Use English language!
  • If you have data stored in an SPSS file, load that file directly into R; they contain a lot of useful information
  • Alternatively, load combinations of 1) data and 2) variable information
  • Create your study database in Excel or similar
  • You always need a properly formatted annotation file, or learn to use the R tools to extract meaningful annotation from structured data
  • Work in a consistent/structured manner; the annotation file from Nightingale was a good example of why that is the case

Save

Here we save those files that we will use in downstream work.

saveRDS(variables, file = "../data/processed/variables.rds")
saveRDS(annotation, file = "../data/processed/annotation.rds")
saveRDS(data_noma, file = "../data/processed/data_noma.rds")
saveRDS(data_clinic, file = "../data/processed/data_clinic.rds")
saveRDS(data_nightingale, file = "../data/processed/data_nightingale.rds")
saveRDS(data_comb, file = "../data/processed/data_comb.rds")

openxlsx::write.xlsx(annotation, file = "../data/processed/annotation.xlsx")
## Note: zip::zip() is deprecated, please use zip::zipr() instead

Session info

To improve reproducibility, print out the session info for this script.

devtools::session_info()
## - Session info ----------------------------------------------------------
## 
## - Packages --------------------------------------------------------------
##  package      * version  date       lib source        
##  assertthat     0.2.1    2019-03-21 [1] CRAN (R 3.6.0)
##  backports      1.1.4    2019-04-10 [1] CRAN (R 3.6.0)
##  broom        * 0.5.2    2019-04-07 [1] CRAN (R 3.6.0)
##  callr          3.3.0    2019-07-04 [1] CRAN (R 3.6.1)
##  cellranger     1.1.0    2016-07-27 [1] CRAN (R 3.6.0)
##  class          7.3-15   2019-01-01 [2] CRAN (R 3.6.0)
##  cli            1.1.0    2019-03-19 [1] CRAN (R 3.6.0)
##  colorspace     1.4-1    2019-03-18 [1] CRAN (R 3.6.0)
##  crayon         1.3.4    2017-09-16 [1] CRAN (R 3.6.0)
##  DBI            1.0.0    2018-05-02 [1] CRAN (R 3.6.0)
##  desc           1.2.0    2018-05-01 [1] CRAN (R 3.6.0)
##  devtools       2.1.0    2019-07-06 [1] CRAN (R 3.6.1)
##  digest         0.6.20   2019-07-04 [1] CRAN (R 3.6.1)
##  dplyr        * 0.8.2    2019-06-29 [1] CRAN (R 3.6.0)
##  e1071          1.7-2    2019-06-05 [1] CRAN (R 3.6.0)
##  ellipsis       0.2.0.1  2019-07-02 [1] CRAN (R 3.6.1)
##  evaluate       0.14     2019-05-28 [1] CRAN (R 3.6.0)
##  fansi          0.4.0    2018-10-05 [1] CRAN (R 3.6.0)
##  forcats      * 0.4.0    2019-02-17 [1] CRAN (R 3.6.0)
##  fs             1.3.1    2019-05-06 [1] CRAN (R 3.6.0)
##  generics       0.0.2    2018-11-29 [1] CRAN (R 3.6.0)
##  ggplot2      * 3.2.1    2019-08-10 [1] CRAN (R 3.6.1)
##  ggrepel        0.8.1    2019-05-07 [1] CRAN (R 3.6.1)
##  glue           1.3.1    2019-03-12 [1] CRAN (R 3.6.0)
##  gridExtra    * 2.3      2017-09-09 [1] CRAN (R 3.6.1)
##  gtable         0.3.0    2019-03-25 [1] CRAN (R 3.6.0)
##  haven        * 2.1.1    2019-07-04 [1] CRAN (R 3.6.1)
##  hms            0.5.0    2019-07-09 [1] CRAN (R 3.6.0)
##  htmltools      0.3.6    2017-04-28 [1] CRAN (R 3.6.0)
##  httr           1.4.0    2018-12-11 [1] CRAN (R 3.6.0)
##  inline         0.3.15   2018-05-18 [1] CRAN (R 3.6.1)
##  jsonlite       1.6      2018-12-07 [1] CRAN (R 3.6.0)
##  knitr        * 1.23     2019-05-18 [1] CRAN (R 3.6.0)
##  labeling       0.3      2014-08-23 [1] CRAN (R 3.6.0)
##  lattice        0.20-38  2018-11-04 [2] CRAN (R 3.6.0)
##  lazyeval       0.2.2    2019-03-15 [1] CRAN (R 3.6.0)
##  lifecycle      0.1.0    2019-08-01 [1] CRAN (R 3.6.1)
##  loo            2.1.0    2019-03-13 [1] CRAN (R 3.6.1)
##  lubridate      1.7.4    2018-04-11 [1] CRAN (R 3.6.0)
##  magrittr       1.5      2014-11-22 [1] CRAN (R 3.6.0)
##  Matrix         1.2-17   2019-03-22 [2] CRAN (R 3.6.0)
##  matrixStats    0.54.0   2018-07-23 [1] CRAN (R 3.6.1)
##  memoise        1.1.0    2017-04-21 [1] CRAN (R 3.6.0)
##  mitools        2.4      2019-04-26 [1] CRAN (R 3.6.1)
##  modelr         0.1.4    2019-02-18 [1] CRAN (R 3.6.0)
##  munsell        0.5.0    2018-06-12 [1] CRAN (R 3.6.0)
##  nlme           3.1-139  2019-04-09 [2] CRAN (R 3.6.0)
##  openxlsx     * 4.1.2    2019-10-29 [1] CRAN (R 3.6.1)
##  packrat        0.5.0    2018-11-14 [1] CRAN (R 3.6.1)
##  pheatmap       1.0.12   2019-01-04 [1] CRAN (R 3.6.0)
##  pillar         1.4.2    2019-06-29 [1] CRAN (R 3.6.0)
##  pkgbuild       1.0.3    2019-03-20 [1] CRAN (R 3.6.0)
##  pkgconfig      2.0.2    2018-08-16 [1] CRAN (R 3.6.0)
##  pkgload        1.0.2    2018-10-29 [1] CRAN (R 3.6.0)
##  plyr           1.8.4    2016-06-08 [1] CRAN (R 3.6.0)
##  prettyunits    1.0.2    2015-07-13 [1] CRAN (R 3.6.0)
##  processx       3.4.0    2019-07-03 [1] CRAN (R 3.6.1)
##  ps             1.3.0    2018-12-21 [1] CRAN (R 3.6.0)
##  purrr        * 0.3.2    2019-03-15 [1] CRAN (R 3.6.0)
##  R6             2.4.0    2019-02-14 [1] CRAN (R 3.6.0)
##  RColorBrewer   1.1-2    2014-12-07 [1] CRAN (R 3.6.0)
##  Rcpp           1.0.1    2019-03-17 [1] CRAN (R 3.6.0)
##  readr        * 1.3.1    2018-12-21 [1] CRAN (R 3.6.0)
##  readxl       * 1.3.1    2019-03-13 [1] CRAN (R 3.6.0)
##  remotes        2.1.0    2019-06-24 [1] CRAN (R 3.6.0)
##  reshape2       1.4.3    2017-12-11 [1] CRAN (R 3.6.0)
##  rlang          0.4.0    2019-06-25 [1] CRAN (R 3.6.0)
##  rmarkdown    * 1.13     2019-05-22 [1] CRAN (R 3.6.0)
##  rprojroot      1.3-2    2018-01-03 [1] CRAN (R 3.6.0)
##  rstan          2.19.2   2019-07-09 [1] CRAN (R 3.6.1)
##  rstudioapi     0.10     2019-03-19 [1] CRAN (R 3.6.0)
##  rvest          0.3.4    2019-05-15 [1] CRAN (R 3.6.0)
##  scales         1.0.0    2018-08-09 [1] CRAN (R 3.6.0)
##  sessioninfo    1.1.1    2018-11-05 [1] CRAN (R 3.6.0)
##  StanHeaders    2.19.0   2019-09-07 [1] CRAN (R 3.6.1)
##  stringi        1.4.3    2019-03-12 [1] CRAN (R 3.6.0)
##  stringr      * 1.4.0    2019-02-10 [1] CRAN (R 3.6.0)
##  survey         3.36     2019-04-27 [1] CRAN (R 3.6.1)
##  survival       2.44-1.1 2019-04-01 [2] CRAN (R 3.6.0)
##  tableone       0.10.0   2019-02-17 [1] CRAN (R 3.6.1)
##  testthat       2.1.1    2019-04-23 [1] CRAN (R 3.6.1)
##  tibble       * 2.1.3    2019-06-06 [1] CRAN (R 3.6.0)
##  tidyr        * 1.0.0    2019-09-11 [1] CRAN (R 3.6.1)
##  tidyselect     0.2.5    2018-10-11 [1] CRAN (R 3.6.0)
##  tidyverse    * 1.2.1    2017-11-14 [1] CRAN (R 3.6.1)
##  usethis        1.5.1    2019-07-04 [1] CRAN (R 3.6.1)
##  utf8           1.1.4    2018-05-24 [1] CRAN (R 3.6.0)
##  vctrs          0.2.0    2019-07-05 [1] CRAN (R 3.6.1)
##  withr          2.1.2    2018-03-15 [1] CRAN (R 3.6.0)
##  xfun           0.8      2019-06-25 [1] CRAN (R 3.6.0)
##  xml2           1.2.0    2018-01-24 [1] CRAN (R 3.6.0)
##  yaml           2.2.0    2018-07-25 [1] CRAN (R 3.6.0)
##  zeallot        0.1.0    2018-01-28 [1] CRAN (R 3.6.0)
##  zip            2.0.4    2019-09-01 [1] CRAN (R 3.6.1)
## 
## [1] C:/Users/jacobjc/Documents/R/win-library/3.6
## [2] C:/Program Files/R/R-3.6.0/library